Rental listings: New York City map

Load necessary packages

library(jsonlite)
library(dplyr)
library(ggplot2)
library(magrittr)
library(ggmap)
library(knitr)

Load training data

# Toggle between the Kaggle kernel layout and a local checkout
KAGGLE <- TRUE
train_path <- if (KAGGLE) "../input/train.json" else "data/train.json"
# Read the JSON file and bind the parsed result into a single table
training <- bind_rows(fromJSON(train_path))
# Set the two list-valued columns aside so they survive the flattening below
photos <- training$photos
features <- training$features
training$photos <- NULL
training$features <- NULL
# Flatten every remaining column and rebuild a plain data.frame.
# sapply() is kept deliberately: it simplifies the unlisted columns before
# data.frame() sees them; the numeric columns are converted in a later step.
training <- data.frame(sapply(training, unlist), stringsAsFactors = FALSE)
# Re-attach the list-valued columns
training$features <- features
training$photos <- photos
# Drop the temporary copies to free memory
rm(features, photos)

Numerical and factor variables

# Columns that must be numeric for the distance computation and the plots
numerical_variables <- c("bathrooms", "bedrooms", "longitude", "latitude", "price")
training[, numerical_variables] <- lapply(training[, numerical_variables],
                                          as.numeric)
# The interest level is the categorical variable used to colour/facet plots
training[["interest_level"]] <- as.factor(training[["interest_level"]])

Extra feature: distance to city center

# New York City Center Coords (reference point for the distance feature)
ny_lat <- 40.785091
ny_lon <- -73.968285
# Alternate New York City Center Coords, looked up through the Google
# geocoder (requires network access); used later to center the map
ny_center <- geocode("new york", source = "google")
ny_center
##         lon      lat
## 1 -74.00594 40.71278
# Add Euclidean Distance to City Center.
# The arithmetic is vectorized over the whole columns, so no element-wise
# mapply() is needed and an empty data set yields numeric(0) instead of list().
# NOTE: the distance is expressed in raw degrees of longitude/latitude, not in
# kilometres; one degree of longitude is shorter than one degree of latitude
# at this latitude, so this is only an approximation of the true distance.
training$distance_city <- sqrt(
  (training$longitude - ny_lon)^2 + (training$latitude - ny_lat)^2
)

Density plot of distance

# Density of the distance, restricted to real estate closer to the city
# center than this threshold (same unit as distance_city)
ny_outliners_dist <- 0.2
training[training$distance_city < ny_outliners_dist, ] %>%
  ggplot(aes(distance_city, color = interest_level)) +
  geom_density()

# Density of the log-distance over all listings (nothing is discarded here)
training %>%
  ggplot(aes(log(distance_city), color = interest_level)) +
  geom_density()

NYC satellite map

# Download a satellite basemap centered on the alternate (geocoded)
# New York City center coordinates
map <- get_googlemap(
  center = as.numeric(ny_center),
  zoom = 12,
  maptype = "satellite",
  sensor = FALSE
)

# Draw every listing on top of the basemap, one panel per interest level,
# with axis titles blanked and a fixed colour palette
p <- ggmap(map) +
  geom_point(
    data = training,
    mapping = aes(x = longitude, y = latitude, color = interest_level),
    size = 1
  ) +
  facet_grid(facets = . ~ interest_level, scales = "free", space = "free") +
  labs(x = "", y = "") +
  scale_colour_brewer(palette = "Set1")
p

Outliers: find real coordinates

# Listings whose longitude or latitude is the placeholder value 0.
# which() turns the mask into row indices and silently skips NA comparisons.
outlier_rows <- which(training$longitude == 0 | training$latitude == 0)
outliers_addrs <- training$street_address[outlier_rows]
outliers_addrs
##  [1] "145 28 Street"        "Van Sicklen street"   "219 E 28th"          
##  [4] "1632 Madison Ave"     "41-42 24th St "       "450 East 83rd Street"
##  [7] "247 west 87"          "118 W 109th"          "246 Mott St "        
## [10] "21 W 106th"           "338 e. 53"            "259 Decatur Street"
# Addresses are supposed to be in NYC: append the city to each query.
# trimws() + paste0() avoid the stray blank before the comma that
# paste() with its default separator would insert.
outliers_ny <- paste0(trimws(outliers_addrs), ", new york")
# Look up the geographic location of every address with the Google geocoder
# (requires network access). geocode() accepts a character vector, so a single
# call returns one row per address with plain numeric lon/lat columns; looping
# with sapply() and transposing would produce list columns instead.
outliers_addrs <- data.frame("street_address" = outliers_addrs,
                             stringsAsFactors = FALSE)
coords <- geocode(outliers_ny, source = "google") %>%
  data.frame %>%
  cbind(outliers_addrs, .)
# seq_len() stays valid when there are no outliers (1:0 would be c(1, 0))
rownames(coords) <- seq_len(nrow(coords))
# Display table
kable(coords)
street_address lon lat
145 28 Street -73.99244 40.74708
Van Sicklen street -73.97504 40.59679
219 E 28th -73.97982 40.74179
1632 Madison Ave -73.94847 40.79576
41-42 24th St -73.94131 40.75153
450 East 83rd Street -73.94899 40.77399
247 west 87 -73.97555 40.78888
118 W 109th -73.96273 40.8015
246 Mott St -73.99466 40.72328
21 W 106th -73.96095 40.79874
338 e. 53 -73.96576 40.75591
259 Decatur Street -73.93344 40.68165

Update dataset

# Write the geocoded coordinates back into the rows that had a placeholder 0.
# The row index is computed once, BEFORE any column is modified, with the same
# "longitude == 0 OR latitude == 0" condition that was used to collect the
# outlier addresses; using two separate masks would misalign rows and
# coordinates as soon as a listing has only one of the two values equal to 0.
outlier_rows <- which(training$longitude == 0 | training$latitude == 0)
# Fail loudly if the lookup table does not match the rows being patched
stopifnot(length(outlier_rows) == nrow(coords))
# unlist() flattens coords columns that may be list columns (depending on how
# coords was assembled) so the target columns stay plain numeric vectors
training$longitude[outlier_rows] <- as.numeric(unlist(coords$lon))
training$latitude[outlier_rows] <- as.numeric(unlist(coords$lat))

Wei Xu

2017-04-13